# -*- coding: utf-8 -*-
"""AI_Agent_Conference.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/16kYniBiCZxsgC9kF2g9N08iapEAKLtNC
"""

!pip install scikit-learn matplotlib seaborn

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt

# Load the generated proteins
df = pd.read_csv("proteins.csv")

# ───────────────────────────────────────────────────────
# 🧠 Add Heuristic Scoring Function
# ───────────────────────────────────────────────────────

def compute_score(row):
    score = 0
    if row["Stability"] == "Stable":
        score += 1
    score += max(0, 1 - abs(row["GRAVY"]))                  # Closer to 0 is better
    score += max(0, 1 - abs(row["pI"] - 7)) * 0.5            # pI near 7 preferred
    score += (1 - row["Instability"] / 100)                 # Lower instability
    return round(score, 3)

df["Score"] = df.apply(compute_score, axis=1)

# ───────────────────────────────────────────────────────
# 📊 Top N Candidates
# ───────────────────────────────────────────────────────

TOP_N = 10
top_proteins = df.sort_values("Score", ascending=False).head(TOP_N)

# Save CSV and FASTA
top_proteins.to_csv("top_proteins.csv", index=False)
with open("top_proteins.fasta", "w") as fasta_file:
    for _, row in top_proteins.iterrows():
        fasta_file.write(f">{row['Protein ID']}\n{row['Sequence']}\n")

print(f"\n✅ Top {TOP_N} proteins saved to 'top_proteins.csv' and 'top_proteins.fasta'.")

# ───────────────────────────────────────────────────────
# 📈 Optional: PCA + KMeans Clustering (Unsupervised ML)
# ───────────────────────────────────────────────────────

# Features for ML
features = ["MW (Da)", "pI", "GRAVY", "Aromaticity", "Instability"]
X = df[features].copy()

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df["PCA1"] = X_pca[:, 0]
df["PCA2"] = X_pca[:, 1]

# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# ───────────────────────────────────────────────────────
# 📊 Plot: Clusters and Top Proteins Highlighted
# ───────────────────────────────────────────────────────

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="PCA1", y="PCA2", hue="Cluster", palette="Set2", s=80)
sns.scatterplot(data=top_proteins, x="PCA1", y="PCA2", color="red", s=150, marker="*", label="Top Candidates")
plt.title("Protein Clusters (PCA) with Top Candidates Highlighted")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt

# ───────────────────────────────────────────────────────
# 🧠 Heuristic Scoring Function
# ───────────────────────────────────────────────────────

def compute_score(row):
    score = 0
    if row["Stability"] == "Stable":
        score += 1
    score += max(0, 1 - abs(row["GRAVY"]))                  # Closer to 0 is better
    score += max(0, 1 - abs(row["pI"] - 7)) * 0.5            # Neutral pI is ideal
    score += (1 - row["Instability"] / 100)                 # Lower instability is better
    return round(score, 3)

# Apply scoring
df["Score"] = df.apply(compute_score, axis=1)

# ───────────────────────────────────────────────────────
# 📊 ML Features: Normalize + PCA + Clustering
# ───────────────────────────────────────────────────────

features = ["MW (Da)", "pI", "GRAVY", "Aromaticity", "Instability"]
X = df[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df["PCA1"] = X_pca[:, 0]
df["PCA2"] = X_pca[:, 1]

# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# ───────────────────────────────────────────────────────
# 🔝 Top N Selection (after PCA so PCA1/2 exist)
# ───────────────────────────────────────────────────────

TOP_N = 10
top_proteins = df.sort_values("Score", ascending=False).head(TOP_N).copy()

# Optional: Save top protein data
top_proteins.to_csv("top_proteins.csv", index=False)

with open("top_proteins.fasta", "w") as fasta_file:
    for _, row in top_proteins.iterrows():
        fasta_file.write(f">{row['Protein ID']}\n{row['Sequence']}\n")

print(f"\n✅ Top {TOP_N} proteins saved to 'top_proteins.csv' and 'top_proteins.fasta'.")

# ───────────────────────────────────────────────────────
# 📈 Plot PCA Clusters & Highlight Top Proteins
# ───────────────────────────────────────────────────────

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="PCA1", y="PCA2", hue="Cluster", palette="Set2", s=80)
sns.scatterplot(data=top_proteins, x="PCA1", y="PCA2", color="red", s=150, marker="*", label="Top Candidates")
plt.title("Protein Clusters (PCA) with Top Candidates Highlighted")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

def updated_score(row):
    score = 0
    score += 1 if row["Stability"] == "Stable" else 0
    score += max(0, 1 - abs(row["GRAVY"]))
    score += max(0, 1 - abs(row["pI"] - 7)) * 0.5
    score += (1 - row["Instability"] / 100)
    score += 0.5 if "PB1FF56B0" in row["Protein ID"] else 0  # reward confirmed hits
    return round(score, 3)

# Apply scoring
df["Score"] = df.apply(compute_score, axis=1)

# ───────────────────────────────────────────────────────
# 📊 ML Features: Normalize + PCA + Clustering
# ───────────────────────────────────────────────────────

features = ["MW (Da)", "pI", "GRAVY", "Aromaticity", "Instability"]
X = df[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df["PCA1"] = X_pca[:, 0]
df["PCA2"] = X_pca[:, 1]

# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
df["Cluster"] = kmeans.fit_predict(X_scaled)

# ───────────────────────────────────────────────────────
# 🔝 Top N Selection (after PCA so PCA1/2 exist)
# ───────────────────────────────────────────────────────

TOP_N = 10
top_proteins = df.sort_values("Score", ascending=False).head(TOP_N).copy()

# Optional: Save top protein data
top_proteins.to_csv("top_proteins.csv", index=False)

with open("top_proteins.fasta", "w") as fasta_file:
    for _, row in top_proteins.iterrows():
        fasta_file.write(f">{row['Protein ID']}\n{row['Sequence']}\n")

print(f"\n✅ Top {TOP_N} proteins saved to 'top_proteins.csv' and 'top_proteins.fasta'.")

# ───────────────────────────────────────────────────────
# 📈 Plot PCA Clusters & Highlight Top Proteins
# ───────────────────────────────────────────────────────

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="PCA1", y="PCA2", hue="Cluster", palette="Set2", s=80)
sns.scatterplot(data=top_proteins, x="PCA1", y="PCA2", color="red", s=150, marker="*", label="Top Candidates")
plt.title("Protein Clusters (PCA) with Top Candidates Highlighted")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

!pip install openai
!pip install biopython

# ╔══════════════════════════════════════════════╗
# ║     🧠 Multi-Agent Protein Generator v2      ║
# ╚══════════════════════════════════════════════╝

from openai import OpenAI
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import math
import uuid
import re

# ───────────────────────────────────────────────────────
# 🔑 OpenAI Client Setup
# ───────────────────────────────────────────────────────

client = OpenAI(api_key="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")  # Replace with your actual key

# ───────────────────────────────────────────────────────
# ⚙️ Configuration
# ───────────────────────────────────────────────────────

protein_type = "Enzyme"
total_length = 100             # Total length of each protein
num_agents = 4                 # Number of agents per protein
segment_length = math.ceil(total_length / num_agents)
num_proteins = 100             # 🔁 Number of proteins to generate

VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")

# ───────────────────────────────────────────────────────
# 🧠 Segment Generator
# ───────────────────────────────────────────────────────

def generate_segment(agent_id: int, protein_type: str, segment_length: int) -> str:
    prompt = f"""
You are Agent {agent_id}, a protein designer. Generate a protein segment of {segment_length} amino acids.
It should be part of a larger {protein_type} protein. Use only standard amino acid 1-letter codes (ACDEFGHIKLMNPQRSTVWY).
Only return the amino acid sequence, no extra commentary.
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip().upper()

# ───────────────────────────────────────────────────────
# 🧪 Polisher Agent
# ───────────────────────────────────────────────────────

def polish_protein(segments: list, protein_type: str) -> str:
    prompt = f"""
You are a protein engineer. A team of agents created a raw protein of type '{protein_type}' composed of the following segments:

{segments}

Please polish and optimize this full protein sequence for:
- Functionality as a {protein_type}
- Folding compatibility
- Avoiding problematic motifs (like long repeats or instability)

Return ONLY the final 1-letter amino acid sequence.
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    final_seq = response.choices[0].message.content.strip().upper()
    final_seq = re.sub(r"[^ACDEFGHIKLMNPQRSTVWY]", "", final_seq)  # Clean sequence
    return final_seq

# ───────────────────────────────────────────────────────
# 🧬 Validation + Analysis
# ───────────────────────────────────────────────────────

def analyze_sequence(seq: str) -> dict:
    seq = seq.upper().strip().replace("\n", "")
    invalid = set(seq) - VALID_AMINO_ACIDS
    if invalid:
        raise ValueError(f"Invalid characters in sequence: {invalid}")

    analysis = ProteinAnalysis(seq)
    instability = analysis.instability_index()
    return {
        "Length": len(seq),
        "MW (Da)": round(analysis.molecular_weight(), 2),
        "pI": round(analysis.isoelectric_point(), 2),
        "GRAVY": round(analysis.gravy(), 3),
        "Aromaticity": round(analysis.aromaticity(), 3),
        "Instability": round(instability, 2),
        "Stability": "Stable" if instability < 40 else "Unstable"
    }

# ───────────────────────────────────────────────────────
# 🚀 Generate Proteins in Batch
# ───────────────────────────────────────────────────────

records = []

for i in range(num_proteins):
    print(f"\n🚧 Generating Protein {i+1}/{num_proteins}...\n")
    segments = []

    for agent_id in range(1, num_agents + 1):
        seg = generate_segment(agent_id, protein_type, segment_length)
        segments.append(seg)

    raw_seq = "".join(segments)
    final_seq = polish_protein(segments, protein_type)

    metrics = analyze_sequence(final_seq)
    protein_id = f"P{str(uuid.uuid4())[:8].upper()}"

    record = {
        "Protein ID": protein_id,
        "Sequence": final_seq,
        **metrics
    }

    records.append(record)

# ───────────────────────────────────────────────────────
# 💾 Save to CSV
# ───────────────────────────────────────────────────────

df = pd.DataFrame(records)
df.to_csv("proteins.csv", index=False)
print("\n✅ All proteins generated and saved to 'proteins.csv'.")

# ───────────────────────────────────────────────────────
# 💾 Optional: Save to FASTA
# ───────────────────────────────────────────────────────

with open("proteins.fasta", "w") as fasta_file:
    for rec in records:
        fasta_file.write(f">{rec['Protein ID']}\n{rec['Sequence']}\n")

print("📄 FASTA file also saved as 'proteins.fasta'")

# ╔════════════════════════════════════╗
# ║   🧠 Multi-Agent Protein Builder   ║
# ╚════════════════════════════════════╝

from openai import OpenAI
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math

# ─────────────────────────────────────────────────────────
# 🔑 OpenAI Client Setup (v1+ SDK)
# ─────────────────────────────────────────────────────────

client = OpenAI(api_key="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")  # Replace with your actual key

# ─────────────────────────────────────────────────────────
# ⚙️ Configuration
# ─────────────────────────────────────────────────────────

protein_type = "Enzyme"       # e.g., "Structural", "Hormone", "Enzyme"
total_length = 100            # Total amino acid length
num_agents = 4                # Number of sub-agents
segment_length = math.ceil(total_length / num_agents)

VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")

# ─────────────────────────────────────────────────────────
# 🧠 Agent: Protein Segment Generator
# ─────────────────────────────────────────────────────────

def generate_segment(agent_id: int, protein_type: str, segment_length: int) -> str:
    prompt = f"""
You are Agent {agent_id}, a protein designer. Generate a protein segment of {segment_length} amino acids.
It should be part of a larger {protein_type} protein. Use only standard amino acid 1-letter codes (ACDEFGHIKLMNPQRSTVWY).
Ensure your sequence could realistically fold and interact with other segments.

Only return the amino acid sequence, no extra commentary.
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip().upper()

# ─────────────────────────────────────────────────────────
# 🧪 Final Agent: Protein Polisher
# ─────────────────────────────────────────────────────────

def polish_protein(segments: list, protein_type: str) -> str:
    prompt = f"""
You are a protein engineer. A team of agents created a raw protein of type '{protein_type}' composed of the following segments:

{segments}

Please polish and optimize this full protein sequence for:
- Functionality as a {protein_type}
- Folding compatibility
- Avoiding problematic motifs (like long repeats or instability)

Return ONLY the final 1-letter amino acid sequence.
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    final_sequence = response.choices[0].message.content
    final_sequence = final_sequence.strip().upper()

    # Remove any quotes, whitespace, or non-amino-acid characters
    import re
    final_sequence = re.sub(r"[^ACDEFGHIKLMNPQRSTVWY]", "", final_sequence)

    return final_sequence

# ─────────────────────────────────────────────────────────
# 🧬 Biopython: Validate and Analyze Protein Sequence
# ─────────────────────────────────────────────────────────

def validate_and_analyze_sequence(sequence: str, name="Protein"):
    sequence = sequence.replace("\n", "").strip().upper()
    invalids = set(sequence) - VALID_AMINO_ACIDS

    if invalids:
        raise ValueError(f"❌ {name} contains invalid amino acids: {invalids}")

    print(f"\n✅ {name} is valid. Length: {len(sequence)} amino acids")

    analysis = ProteinAnalysis(sequence)
    print(f"• Molecular Weight: {analysis.molecular_weight():.2f} Da")
    print(f"• Aromaticity: {analysis.aromaticity():.3f}")
    print(f"• Instability Index: {analysis.instability_index():.2f}")
    print(f"• Isoelectric Point (pI): {analysis.isoelectric_point():.2f}")
    print(f"• Hydrophobicity (GRAVY): {analysis.gravy():.3f}")

# ─────────────────────────────────────────────────────────
# 🚀 Orchestration: Generate → Validate → Polish
# ─────────────────────────────────────────────────────────

print(f"\n🚧 Generating protein using {num_agents} agents...\n")

segments = []
for agent_id in range(1, num_agents + 1):
    seg = generate_segment(agent_id, protein_type, segment_length)
    print(f"🔹 Agent {agent_id} Segment ({len(seg)} aa): {seg}")
    validate_and_analyze_sequence(seg, name=f"Agent {agent_id} Segment")
    segments.append(seg)

# Combine segments
print("\n🧬 Raw concatenated sequence:")
raw_sequence = "".join(segments)
print(raw_sequence)

# Polish sequence
print("\n🧪 Polishing final protein...")
final_protein = polish_protein(segments, protein_type)

# Final validation
print("\n✅ Final Polished Protein:")
print(final_protein)
validate_and_analyze_sequence(final_protein, name="Final Polished Protein")

# ╔════════════════════════════════════╗
# ║   🧠 Multi-Agent Protein Builder   ║
# ╚════════════════════════════════════╝

import openai
import math
from typing import List
from openai import OpenAI

client = OpenAI(api_key="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")

# Example call:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}]
)
output = response.choices[0].message.content.strip()


# 🔑 API key
openai.api_key =

# ════════════════════════════════════════════════
# 🔧 CONFIGURATION: Set protein parameters here
# ════════════════════════════════════════════════

protein_type = "Enzyme"      # e.g., "Structural", "Hormone", "Enzyme"
total_length = 100           # Total amino acid length
num_agents = 4               # Number of agent segments

# ════════════════════════════════════════════════
# 🧠 AGENT 1-N: Sequence Generators
# ════════════════════════════════════════════════

def generate_segment(agent_id: int, protein_type: str, segment_length: int) -> str:
    prompt = f"""
You are Agent {agent_id}, a protein designer. Generate a protein segment of {segment_length} amino acids.
It should be part of a larger {protein_type} protein. Use only standard amino acid 1-letter codes (ACDEFGHIKLMNPQRSTVWY).
Ensure your sequence could realistically fold and interact with other segments.

Only return the amino acid sequence, no extra commentary.
"""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response['choices'][0]['message']['content'].strip()

# ════════════════════════════════════════════════
# 🧪 FINAL AGENT: Polisher
# ════════════════════════════════════════════════

def polish_protein(segments: List[str], protein_type: str) -> str:
    raw_sequence = "".join(segments)
    prompt = f"""
You are a protein engineer. A team of agents created a raw protein of type '{protein_type}' composed of the following segments:

{segments}

Please polish and optimize this full protein sequence for:
- Functionality as a {protein_type}
- Folding compatibility
- Avoiding problematic motifs (like long repeats or instability)

Return ONLY the final 1-letter amino acid sequence.
"""
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response['choices'][0]['message']['content'].strip()

# ════════════════════════════════════════════════
# 🔁 MAIN LOGIC: Multi-Agent Orchestration
# ════════════════════════════════════════════════

segment_length = math.ceil(total_length / num_agents)
segments = []

print(f"Generating protein with {num_agents} agents...\n")

for agent_id in range(1, num_agents + 1):
    seg = generate_segment(agent_id, protein_type, segment_length)
    print(f"🔹 Agent {agent_id} Segment ({len(seg)} aa): {seg}")
    segments.append(seg)

print("\n🧬 Raw sequence (before polish):")
print("".join(segments))

# Final polishing
print("\n🧪 Polishing full protein...")
final_protein = polish_protein(segments, protein_type)

print("\n✅ Final Protein Sequence:")
print(final_protein)
print(f"\n🔬 Final Length: {len(final_protein)} amino acids")